library(tidyverse)
library(magrittr)
library(tximeta)
library(DESeq2)
library(UpSetR)
library(pheatmap)
library(RColorBrewer)
library(ggfortify)
library(glmnet)
library(EnhancedVolcano)
library(umap)
library(biomaRt)
#library(ggbeeswarm)
This markdown is based on this vignette and describes the processing of the FASTQ files from the RNAseq data of Fleischer et al., as well as performing a differential expression analysis. The differentially expressed genes are then used for building Steiner trees.
The FASTQ files from the study from Fleischer et al. were downloaded from the European Nucleotide Archive ENA. The used accession numbers were: SRR7093809 - SRR7093951
For the transcript quantification the tool salmon was used according to this vignette. For the quantification, first a reference transcriptome is needed for the alignment. A human reference transcriptome can be downloaded from ensemblgenomes. Release 105 was used instead of the newest one 106 because tximeta only works for versions up to 105 at the moment. Then salmon was used to build an index on that transcriptome (salmon index -t ../genome_data/Homo_sapiens.GRCh38.cdna.all.fa.gz -i grch38_index). Finally, the samples were quantified with: salmon quant -i grch38_index -l A -r /mnt/jana/fastq/{sample}.fastq.gz -p 8 –validateMappings -o quants/{sample}_quant. This command produces an output folder for each sample that contains the quant.sf file, which includes the count values, used in the following.
# Definition of the age groups
age_groups <- data.frame(Age = as.character(seq(1, 96, 1)),
age_group = c(rep('1-15', 15), rep('16-26', 11),
rep('27-60', 34), rep('61-85', 25),
rep('86-96', 11)))
# get info about the runs
runs_info <- read.delim(paste0(dir, "Data/SraRunTable.txt"), sep = ",") %>%
mutate(Age = gsub("y[rs]\\d+mos", "", Age)) %>%
mutate(Age = gsub("yr", "", Age)) %>%
left_join(age_groups) %>%
mutate(age_group = factor(age_group, levels = unique(age_groups$age_group))) %>%
filter(disease == "Normal")
## Joining, by = "Age"
#filter(sex == "male")
head(runs_info, n = 3)
## Run Age Assay.Type AvgSpotLen Bases BioProject BioSample
## 1 SRR7093810 12 RNA-Seq 75 1538664000 PRJNA454681 SAMN09011820
## 2 SRR7093812 25 RNA-Seq 75 1495228725 PRJNA454681 SAMN09011818
## 3 SRR7093815 26 RNA-Seq 75 1595644800 PRJNA454681 SAMN09011815
## Bytes cell_id Center.Name Consent DATASTORE.filetype DATASTORE.provider
## 1 574146143 AG16409 GEO public sra,fastq ncbi,gs,s3
## 2 540157644 AG09975 GEO public fastq,sra gs,s3,ncbi
## 3 600307232 AG07124 GEO public sra,fastq gs,ncbi,s3
## DATASTORE.region disease Experiment GEO_Accession..exp.
## 1 s3.us-east-1,gs.US,ncbi.public Normal SRX4022457 GSM3124561
## 2 gs.US,s3.us-east-1,ncbi.public Normal SRX4022459 GSM3124563
## 3 gs.US,ncbi.public,s3.us-east-1 Normal SRX4022462 GSM3124566
## Instrument LibraryLayout LibrarySelection LibrarySource Organism
## 1 NextSeq 500 SINGLE cDNA TRANSCRIPTOMIC Homo sapiens
## 2 NextSeq 500 SINGLE cDNA TRANSCRIPTOMIC Homo sapiens
## 3 NextSeq 500 SINGLE cDNA TRANSCRIPTOMIC Homo sapiens
## Platform ReleaseDate Sample.Name sex SRA.Study source_name
## 1 ILLUMINA 2018-11-17T00:00:00Z GSM3124561 male SRP144355 Skin; Unspecified
## 2 ILLUMINA 2018-11-17T00:00:00Z GSM3124563 male SRP144355 Skin; Arm
## 3 ILLUMINA 2018-11-17T00:00:00Z GSM3124566 female SRP144355 Skin; Arm
## Ethnicity age_group
## 1 Caucasian 1-15
## 2 caucasian 16-26
## 3 Caucasian 16-26
runs_info$names <- runs_info$Run
runs_info$files <- paste0(dir, "Data/rna_counts/", runs_info$names, "_quant/quant.sf")
#file.exists(runs_info$files)
runs_info %<>% mutate(Age = as.numeric(Age)) %>%
mutate(Sex = ifelse(sex == 'male', 'Male', 'Female'))
ggplot(runs_info, aes(x = Age, fill = Sex)) +
geom_bar() +
theme_bw() +
xlab('Age') +
ylab('Number of individuals') +
theme(text = element_text(size = 18))
ggplot(runs_info, aes(x = age_group)) +
geom_bar() +
theme_bw() +
ylab('number of individuals') +
theme(text = element_text(size = 17))
# create summarized experiment with tximeta
se <- tximeta(runs_info)
## importing quantifications
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133
## found matching transcriptome:
## [ Ensembl - Homo sapiens - release 105 ]
## loading existing EnsDb created: 2022-10-25 14:37:43
## loading existing transcript ranges created: 2022-10-25 14:37:49
# convert from ensembl transcript id to gene id
gse <- summarizeToGene(se)
## loading existing EnsDb created: 2022-10-25 14:37:43
## obtaining transcript-to-gene mapping from database
## loading existing gene ranges created: 2022-10-25 14:37:58
## summarizing abundance
## summarizing counts
## summarizing length
gse <- gse[rowData(gse)$gene_biotype == "protein_coding", ]
dim(assay(gse))
## [1] 21871 133
# filter for genes that have counts of at least 10 in at least 5 samples
keep <- rowSums(assay(gse) >= 10) >= 5
gse <- gse[keep,]
#keep2 <- rowMax(assay(gse)) /rowMin(assay(gse)) > 5
#gse <- gse[keep2,]
assay(gse)[1:5, 1:5]
## SRR7093810 SRR7093812 SRR7093815 SRR7093817 SRR7093819
## ENSG00000000003 357.013 450.912 588.386 557.761 592.659
## ENSG00000000419 622.399 648.336 690.053 635.352 836.884
## ENSG00000000457 218.196 179.125 191.908 226.061 272.195
## ENSG00000000460 121.005 256.007 357.007 358.488 543.658
## ENSG00000000971 175.001 367.989 369.000 624.000 177.999
dim(assay(gse))
## [1] 15337 133
dds <- DESeqDataSet(gse, design = ~ age_group)
## using counts and average transcript lengths from tximeta
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
# differential expression analysis
dds <- DESeq(dds)
## estimating size factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## -- replacing outliers and refitting for 221 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
foldchanges <- lapply(seq(1, length(unique(age_groups$age_group))-1, 1), function(ix) {
t0 = unique(age_groups$age_group)[ix]
t1 <- unique(age_groups$age_group)[ix+1]
# calculate log2 foldchange between two age intervals
res <- results(dds, contrast=c("age_group", t1, t0))
res$gene <- rowData(dds)$symbol
res$transition <- paste0('fc_', t0, '_', t1)
res <- subset(res, select = c(gene, transition, log2FoldChange, pvalue, padj)) %>%
as.data.frame() %>%
rownames_to_column(var = "ensembl_id") %>%
filter(gene != "")
return(res)}) %>%
bind_rows() %>%
mutate(abs_log2_fc = abs(log2FoldChange)) %>%
arrange(desc(abs_log2_fc)) %>%
drop_na(padj)
head(foldchanges)
## ensembl_id gene transition log2FoldChange pvalue
## 1 ENSG00000154839 SKA1 fc_16-26_27-60 50.00218 3.105491e-68
## 2 ENSG00000228177 C6orf47 fc_16-26_27-60 42.60803 2.140504e-20
## 3 ENSG00000278259 MYO19 fc_61-85_86-96 38.90985 5.526890e-70
## 4 ENSG00000278259 MYO19 fc_27-60_61-85 -36.81495 2.630264e-86
## 5 ENSG00000276023 DUSP14 fc_61-85_86-96 33.20129 6.807138e-25
## 6 ENSG00000204592 HLA-E fc_61-85_86-96 32.25490 3.287560e-12
## padj abs_log2_fc
## 1 4.762580e-64 50.00218
## 2 6.565354e-17 42.60803
## 3 8.476039e-66 38.90985
## 4 4.033773e-82 36.81495
## 5 6.228426e-23 33.20129
## 6 3.429798e-11 32.25490
Save all considered genes (the ones with counts >= 10 in at least 5 samples) with ensembl id and gene symbol
all_genes <- subset(foldchanges, select = c(ensembl_id, gene)) %>%
distinct()
write.csv(all_genes, paste0(dir, 'Data/all_genes.csv'), row.names = FALSE)
group_by(foldchanges, transition) %>%
summarize("p<0.01" = length(which(padj < 0.01)),
"p<0.05" = length(which(padj < 0.05)),
"p<0.1" = length(which(padj < 0.1)),
"p<0.2" = length(which(padj < 0.2)))
## # A tibble: 4 × 5
## transition `p<0.01` `p<0.05` `p<0.1` `p<0.2`
## <chr> <int> <int> <int> <int>
## 1 fc_1-15_16-26 126 414 745 1545
## 2 fc_16-26_27-60 28 142 336 631
## 3 fc_27-60_61-85 48 161 327 834
## 4 fc_61-85_86-96 7034 8490 9308 10309
DE_subsampling <- read.delim(paste0(dir, "Data/subsampling_DE_0.1.csv"), sep = ",") %>%
group_by(transition, gene) %>%
summarise(n = n()) %>%
arrange(desc(n)) %>%
group_by(transition) %>%
mutate(rank = rank(-n, ties.method = 'first'))
## `summarise()` has grouped output by 'transition'. You can override using the
## `.groups` argument.
names <- data.frame(transition = c("fc_1-15_16-26", "fc_16-26_27-60", "fc_27-60_61-85", "fc_61-85_86-96"),
group = c("Group 1 vs. Group 2", "Group 2 vs. Group 3", "Group 3 vs. Group 4", "Group 4 vs. Group 5"))
DE_subsampling %<>% left_join(names)
## Joining, by = "transition"
# How often does each gene occur in the simulations?
ggplot(DE_subsampling, aes(x = as.numeric(rank), y = n)) +
geom_line() +
facet_wrap(~group, scales = "free_x", nrow = 1) +
theme_bw() +
xlab("ranked genes") +
ylab("% subsamples \n with p < 0.1") +
theme(text = element_text(size = 18))
filter(DE_subsampling, n > 25) %>% summarize(n = n())
## # A tibble: 4 × 2
## transition n
## <chr> <int>
## 1 fc_1-15_16-26 630
## 2 fc_16-26_27-60 229
## 3 fc_27-60_61-85 260
## 4 fc_61-85_86-96 9381
# Filtering: such that around 200 robust & significant DE genes per transition
thresholds <- data.frame('transition' = c('fc_1-15_16-26', 'fc_16-26_27-60',
'fc_27-60_61-85', 'fc_61-85_86-96'),
'p_threshold' = c(0.05, 0.1, 0.1, 1e-17),
'n_threshold' = c(50, 25, 20, 99),
'fc_threshold' = c(0.6, 0.4, 0.4, 1.6))
foldchanges_filtered <- subset(foldchanges, select = -c(pvalue)) %>%
left_join(DE_subsampling, by = c('gene', 'transition')) %>%
left_join(thresholds) %>%
filter(padj < p_threshold & n > n_threshold & abs_log2_fc > fc_threshold)
## Joining, by = "transition"
group_by(foldchanges_filtered, transition) %>% summarize(n = n())
## # A tibble: 4 × 2
## transition n
## <chr> <int>
## 1 fc_1-15_16-26 182
## 2 fc_16-26_27-60 162
## 3 fc_27-60_61-85 163
## 4 fc_61-85_86-96 184
# Save the results for various thresholds on the adjusted p-value
p_0.05 <- foldchanges_filtered %>%
group_by(gene, transition) %>%
# mean for multiple ensembl-ids belonging to the same gene
summarize(log2FoldChange = mean(log2FoldChange)) %>%
mutate(abs_log2_fc = abs(log2FoldChange)) %>%
mutate(updown = ifelse(log2FoldChange > 0, 'up', 'down'))
## `summarise()` has grouped output by 'gene'. You can override using the
## `.groups` argument.
write.csv(p_0.05, paste0(dir, 'Data/DE_updown.csv'), row.names = FALSE)
write.csv(subset(p_0.05, select = c(gene, transition, abs_log2_fc)),
paste0(dir, "Data/DE_var_p_n_200.csv"), row.names = FALSE)
Select example genes with highest absolute log2 fold change for a given transition:
filter(foldchanges_filtered, transition == "fc_27-60_61-85") %>%
arrange(desc(abs_log2_fc)) %>%
head(n = 5)
## ensembl_id gene transition log2FoldChange padj abs_log2_fc
## 1 ENSG00000278259 MYO19 fc_27-60_61-85 -36.814945 4.033773e-82 36.814945
## 2 ENSG00000204227 RING1 fc_27-60_61-85 -25.978800 4.799849e-56 25.978800
## 3 ENSG00000157870 PRXL2B fc_27-60_61-85 -24.293853 1.065110e-28 24.293853
## 4 ENSG00000179772 FOXS1 fc_27-60_61-85 3.474460 4.540675e-02 3.474460
## 5 ENSG00000174697 LEP fc_27-60_61-85 3.338923 1.808030e-05 3.338923
## n rank group p_threshold n_threshold fc_threshold
## 1 99 4 Group 3 vs. Group 4 0.1 20 0.4
## 2 100 3 Group 3 vs. Group 4 0.1 20 0.4
## 3 91 12 Group 3 vs. Group 4 0.1 20 0.4
## 4 47 114 Group 3 vs. Group 4 0.1 20 0.4
## 5 51 98 Group 3 vs. Group 4 0.1 20 0.4
coldata <- as.data.frame(subset(colData(dds), select = c(names, Age, age_group, sex)))
count_data <- fpkm(dds) %>%
as.data.frame() %>%
rownames_to_column(var = "ensembl_id") %>%
mutate(gene = rowData(dds)$symbol) %>%
gather(key = "names", value = "counts", -c(ensembl_id, gene)) %>%
left_join(coldata, by = "names") %>%
filter(gene != "")
write.csv(count_data, paste0(dir, "Data/count_data.csv"), row.names = FALSE)
head(count_data)
## ensembl_id gene names counts Age age_group sex
## 1 ENSG00000000003 TSPAN6 SRR7093810 8.675290 12 1-15 male
## 2 ENSG00000000419 DPM1 SRR7093810 35.885730 12 1-15 male
## 3 ENSG00000000457 SCYL3 SRR7093810 3.334175 12 1-15 male
## 4 ENSG00000000460 C1orf112 SRR7093810 2.652753 12 1-15 male
## 5 ENSG00000000971 CFH SRR7093810 3.709251 12 1-15 male
## 6 ENSG00000001036 FUCA2 SRR7093810 53.758105 12 1-15 male
# Mean counts per age group
mean_counts <- count_data %>% group_by(ensembl_id, gene, age_group) %>%
summarize(counts = mean(counts)) %>%
ungroup()
## `summarise()` has grouped output by 'ensembl_id', 'gene'. You can override
## using the `.groups` argument.
write.csv(mean_counts, paste0(dir, "Data/mean_counts.csv"), row.names = FALSE)
head(mean_counts)
## # A tibble: 6 × 4
## ensembl_id gene age_group counts
## <chr> <chr> <fct> <dbl>
## 1 ENSG00000000003 TSPAN6 1-15 9.38
## 2 ENSG00000000003 TSPAN6 16-26 12.1
## 3 ENSG00000000003 TSPAN6 27-60 12.5
## 4 ENSG00000000003 TSPAN6 61-85 11.4
## 5 ENSG00000000003 TSPAN6 86-96 16.5
## 6 ENSG00000000419 DPM1 1-15 36.9
# get activities of all genes per age group (based on FPKM values)
fpkm <- fpkm(dds) %>%
as.data.frame() %>%
mutate(gene = rowData(dds)$symbol) %>%
rownames_to_column(var = "ensembl_gene_id") %>%
gather(key = "names", value = "counts", -c(ensembl_gene_id, gene)) %>%
left_join(coldata, by = "names") %>%
filter(gene != "") %>%
group_by(gene, age_group) %>%
summarize(counts = mean(counts)) %>%
mutate(activity = ifelse(counts > 0.8, 'active', 'inactive'))
## `summarise()` has grouped output by 'gene'. You can override using the
## `.groups` argument.
fpkm_plot <- fpkm %>%
mutate(age_group = recode(age_group, "1-15" = "Group 1", "16-26" = "Group 2",
"27-60" = "Group 3", "61-85" = "Group 4", "86-96" = "Group 5"))
ggplot(fpkm_plot, aes(x = log(counts+1))) +
geom_histogram(bins = 40) +
geom_vline(xintercept = log(0.8 + 1), color = "red") +
facet_wrap(~age_group, nrow = 1) +
theme_bw() +
theme(text = element_text(size = 22)) +
xlab('log(FPKM + 1)')
gene_activity <- data.frame(transition = c('fc_1-15_16-26', 'fc_16-26_27-60',
'fc_27-60_61-85', 'fc_61-85_86-96')) %>%
separate(transition, c('fc', 't0', 't1'), sep = "_", remove = FALSE) %>%
left_join(dplyr::rename(fpkm, t0 = age_group, a0 = activity, c0 = counts)) %>%
left_join(dplyr::rename(fpkm, t1 = age_group, a1 = activity, c1 = counts)) %>%
mutate(activity = paste0(a0, "_", a1)) %>%
dplyr::filter(activity != 'inactive_inactive') %>%
subset(select = c(transition, gene, activity))
## Joining, by = "t0"
## Joining, by = c("t1", "gene")
write.csv(gene_activity, paste0(dir, "Data/gene_activity.csv"), row.names = FALSE)
head(gene_activity)
## transition gene activity
## 1 fc_1-15_16-26 A1BG active_active
## 2 fc_1-15_16-26 A2M active_active
## 3 fc_1-15_16-26 A4GALT active_active
## 4 fc_1-15_16-26 AAAS active_active
## 5 fc_1-15_16-26 AACS active_active
## 6 fc_1-15_16-26 AADAT active_active
#split DE genes into list of groups according to the transitions
ages <- c("1-15", "16-26", "27-60", "61-85", "86-96")
top_list <- filter(fpkm, activity == 'active') %>%
mutate(age_group = factor(age_group, levels = ages)) %>%
group_by(age_group) %>%
group_split()
names(top_list) <- c("Group 1", "Group 2", "Group 3", "Group 4", "Group 5")
top_list <- lapply(top_list, function(list) list$gene)
upset(fromList(top_list), order.by = "freq", nsets = 9,
mainbar.y.label = "Intersections", sets.x.label = "Active Genes",
text.scale = c(3.5, 3.5, 3, 3, 3.5, 3.5), nintersects = 20,
keep.order= TRUE, sets = c("Group 5", "Group 4", "Group 3",
"Group 2", "Group 1"))
foldchanges %<>% left_join(names)
## Joining, by = "transition"
ggplot(foldchanges, aes(x = pvalue)) +
geom_histogram() +
facet_wrap(~group, scale = "free_y", nrow = 1) +
xlab('p-value') +
theme_bw() +
theme(text = element_text(size = 18))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#split DE genes into list of groups according to the transitions
transitions <- c("fc_1-15_16-26", "fc_16-26_27-60", "fc_27-60_61-85",
"fc_61-85_86-96")
top_list <- p_0.05 %>%
mutate(transition = factor(transition, levels = transitions)) %>%
group_by(transition) %>%
group_split()
names(top_list) <- c("Group 1 vs. Group 2", "Group 2 vs. Group 3", "Group 3 vs. Group 4", "Group 4 vs. Group 5")
top_list <- lapply(top_list, function(list) list$gene)
upset(fromList(top_list), order.by = "freq", nsets = 9,
mainbar.y.label = "Intersections", sets.x.label = "DE Genes",
text.scale = c(3.5, 3.5, 3, 3, 3.5, 3.5), nintersects = 20,
keep.order= TRUE, sets = c("Group 4 vs. Group 5", "Group 3 vs. Group 4",
"Group 2 vs. Group 3", "Group 1 vs. Group 2"))
# variance-stabilizing transformation
vsd <- vst(dds)
#vsd <- vsd[rownames(vsd) %in% lasso$ensembl_id,]
# PCA
pca <- plotPCA(vsd, intgroup = c("age_group"), returnData = TRUE)
percentVar <- round(100 * attr(pca, "percentVar"))
ggplot(pca, aes(PC1, PC2, color=age_group)) +
geom_point(size=3) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed() +
theme_bw() +
theme(text = element_text(size = 16)) +
scale_colour_manual(values = c('#ffd166', '#ef476f', '#06d6a0', '#118ab2', '#073b4c'))
df_pca <- rownames_to_column(data.frame(t(assay(vsd)))) %>%
left_join(rownames_to_column(subset(data.frame(colData(vsd)), select = c('Age', 'sex', 'source_name',
'Ethnicity', 'age_group'))))
## Joining, by = "rowname"
pca <- prcomp(df_pca[, 2:(ncol(df_pca)-5)])
autoplot(pca, data = df_pca, colour = "age_group", x = 1, y = 2) +
theme_bw() +
theme(text = element_text(size = 20))
# Is another PC than 1 & 2 higher correlated to age?
cor(df_pca[, 15339], pca$x[, 1])
## [1] -0.5681123
cor(df_pca[, 15339], pca$x[, 2])
## [1] 0.1992682
cor(df_pca[, 15339], pca$x[, 3])
## [1] 0.1445767
cor(df_pca[, 15339], pca$x[, 4])
## [1] -0.1405156
cor(df_pca[, 15339], pca$x[, 5])
## [1] 0.02776604
ggplot(runs_info, aes(x = Age)) +
geom_histogram() +
facet_wrap(~source_name)
set.seed(142)
umap_fit <- umap(df_pca[, 2:(ncol(df_pca)-5)])
umap_df <- umap_fit$layout %>%
as.data.frame()%>%
dplyr::rename(UMAP1="V1", UMAP2="V2") %>%
mutate(sample=df_pca$rowname) %>%
inner_join(subset(data.frame(colData(vsd)),
select = c('Age', 'sex', 'source_name', 'Ethnicity', 'age_group')) %>%
rownames_to_column(var = "sample"))
## Joining, by = "sample"
ggplot(umap_df, aes(x = UMAP1, y = UMAP2, color = age_group)) +
geom_point() +
theme_bw() +
theme(text = element_text(size = 20))
# save counts with variance-stabilizing transformation for MEFISTO
count_data_vst <- assay(vsd) %>%
as.data.frame() %>%
rownames_to_column(var = "ensembl_id") %>%
mutate(gene = rowData(vsd)$symbol) %>%
gather(key = "names", value = "counts", -c(ensembl_id, gene)) %>%
left_join(coldata, by = "names") %>%
filter(gene != "")
write.csv(count_data_vst, paste0(dir, "MOFA/Data/vst_counts.csv"), row.names = FALSE)
write.csv(count_data_vst, paste0(dir, "Data/vst_counts.csv"), row.names = FALSE)
# Mean counts per age group
mean_counts <- count_data_vst %>% group_by(ensembl_id, gene, age_group) %>%
summarize(counts = mean(counts)) %>%
ungroup()
## `summarise()` has grouped output by 'ensembl_id', 'gene'. You can override
## using the `.groups` argument.
#write.csv(mean_counts, paste0(dir, "Data/mean_counts.csv"), row.names = FALSE)
head(mean_counts)
## # A tibble: 6 × 4
## ensembl_id gene age_group counts
## <chr> <chr> <fct> <dbl>
## 1 ENSG00000000003 TSPAN6 1-15 8.74
## 2 ENSG00000000003 TSPAN6 16-26 9.08
## 3 ENSG00000000003 TSPAN6 27-60 9.12
## 4 ENSG00000000003 TSPAN6 61-85 9.01
## 5 ENSG00000000003 TSPAN6 86-96 9.46
## 6 ENSG00000000419 DPM1 1-15 9.48
groups = data.frame(transition = transitions,
groups = c("Group 1 vs. Group 2", "Group 2 vs. Group 3", "Group 3 vs. Group 4", "Group 4 vs. Group 5"))
expr_fc <- foldchanges %>% separate(transition, c("fc", "t0", "t1"), sep = "_", remove = FALSE) %>%
left_join(dplyr::rename(mean_counts, t0 = age_group, counts_t0 = counts)) %>%
left_join(dplyr::rename(mean_counts, t1 = age_group, counts_t1 = counts)) %>%
subset(select = c(ensembl_id, gene, transition, counts_t0, counts_t1, log2FoldChange, abs_log2_fc, padj)) %>%
left_join(mutate(p_0.05, selected = 'DE genes')) %>%
mutate(selected = factor(replace_na(selected, 'Non DE genes'), levels = c("DE genes", "Non DE genes"))) %>%
#arrange(desc(selected)) %>%
left_join(groups)
## Joining, by = c("ensembl_id", "gene", "t0")
## Joining, by = c("ensembl_id", "gene", "t1")
## Joining, by = c("gene", "transition", "log2FoldChange", "abs_log2_fc")
## Joining, by = "transition"
tail(expr_fc)
## ensembl_id gene transition counts_t0 counts_t1 log2FoldChange
## 59827 ENSG00000119632 IFI27L2 fc_27-60_61-85 5.659593 5.659593 0
## 59828 ENSG00000170270 GON7 fc_27-60_61-85 5.659593 5.659593 0
## 59829 ENSG00000204592 HLA-E fc_27-60_61-85 5.659593 5.659593 0
## 59830 ENSG00000223654 FLOT1 fc_27-60_61-85 5.659593 5.659593 0
## 59831 ENSG00000236428 PPP1R18 fc_27-60_61-85 5.659593 5.659593 0
## 59832 ENSG00000276023 DUSP14 fc_27-60_61-85 5.659593 5.659593 0
## abs_log2_fc padj updown selected groups
## 59827 0 1 <NA> Non DE genes Group 3 vs. Group 4
## 59828 0 1 <NA> Non DE genes Group 3 vs. Group 4
## 59829 0 1 <NA> Non DE genes Group 3 vs. Group 4
## 59830 0 1 <NA> Non DE genes Group 3 vs. Group 4
## 59831 0 1 <NA> Non DE genes Group 3 vs. Group 4
## 59832 0 1 <NA> Non DE genes Group 3 vs. Group 4
group.colors <- c("Non DE genes" = "grey", "DE genes" = "red")
ggplot(expr_fc, aes(x = counts_t0 , y = counts_t1 , color = selected)) +
geom_point(size = 0.5, alpha = 0.6) +
facet_wrap(~groups, nrow = 1) +
theme_bw() +
geom_smooth(method='lm', color = 'black', size = 0.5) +
scale_color_manual(values=group.colors) +
xlab("Mean expression in the first age group") +
ylab("Mean expression in \n the second age group") +
theme(text = element_text(size = 20)) +
xlim(0, 22) +
ylim(0,22) +
theme(legend.position="bottom")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## `geom_smooth()` using formula = 'y ~ x'
Why are some of the selected genes so close to the black line?
expr_fc %>%
mutate(diff = abs((counts_t0 - counts_t1)/(counts_t1+0.1))) %>%
filter(selected == "True") %>%
arrange(diff) %>% head()
## [1] ensembl_id gene transition counts_t0 counts_t1
## [6] log2FoldChange abs_log2_fc padj updown selected
## [11] groups diff
## <0 Zeilen> (oder row.names mit Länge 0)
geneCounts <- plotCounts(dds, gene = "ENSG00000128606", intgroup = c("age_group"),
returnData = TRUE, normalized = TRUE)
ggplot(geneCounts, aes(x = age_group, y = count)) +
#geom_beeswarm(cex = 3) +
geom_violin(draw_quantiles = c(0.5), trim = TRUE) +
theme_bw()
EnhancedVolcano(expr_fc,
lab = expr_fc$gene,
x = 'log2FoldChange',
y = 'padj') +
facet_wrap(~transition, scales = "free")
df = data.frame(t(assay(gse))) %>%
rownames_to_column(var = "sample") %>%
left_join(rownames_to_column(subset(data.frame(colData(gse)), select = c("Age")), var = 'sample'))
## Joining, by = "sample"
model <- glmnet(subset(df, select = -c(sample, Age)), df$Age, alpha = 1, lambda = 1)
lasso <- data.frame(ensembl_id = dimnames(coef(model))[[1]], coefficient = matrix(coef(model))) %>%
filter(coefficient != 0) %>%
filter(ensembl_id != "(Intercept)") %>%
arrange(desc(abs(coefficient)))
lasso_plot <- lasso %>%
# add corresponding gene names to the ensembl ids
left_join(dplyr::rename(as_tibble(subset(rowData(gse), select = c(gene_id, gene_name))),
'ensembl_id' = 'gene_id')) %>%
filter(gene_name != "") %>%
group_by(gene_name) %>%
summarize(coefficient = max(abs(coefficient))) %>%
arrange(desc(abs(coefficient)))
## Joining, by = "ensembl_id"
lasso_plot %<>% mutate(gene_name = factor(gene_name, levels = lasso_plot$gene_name))
write.csv(lasso_plot, paste0(dir, "Data/lasso_genes.csv"), row.names = FALSE)
ggplot(lasso_plot, aes(x = gene_name, y = coefficient)) +
geom_bar(stat='identity') +
theme_bw() +
xlab('LASSO-selected genes') +
ylab('Absolute LASSO \n regression coefficient') +
theme(axis.text.x = element_text(angle = 90)) +
theme(text = element_text(size = 18))
sampleDists <- dist(t(assay(vsd[rownames(vsd) %in% lasso$ensembl_id,])))
sampleDistMatrix <- as.matrix(sampleDists)
annotation <- data.frame(subset(colData(vsd), select = c(age_group))) %>%
rownames_to_column(var = "sample")
age_groups <- data.frame("age_group" = c("1-15", "16-26", "27-60", "61-85", "86-96"),
"Groups" = c("Group 1 (1-15y)", "Group 2 (16-26y)", "Group 3 (27-60y)",
"Group 4 (61-85y)", "Group 5 (86-96y)"))
annotation %<>% left_join(age_groups) %>%
column_to_rownames(var = "sample") %>%
subset(select = c(Groups))
## Joining, by = "age_group"
colors <- colorRampPalette(rev(brewer.pal(9, "Reds")))(255)
annoCol <- c('#ffd166', '#ef476f', '#06d6a0', '#118ab2', '#073b4c')
names(annoCol) <- c("Group 1 (1-15y)", "Group 2 (16-26y)", "Group 3 (27-60y)", "Group 4 (61-85y)", "Group 5 (86-96y)")
annoCol <- list(Groups = annoCol)
pheatmap(sampleDistMatrix,
clustering_distance_rows = sampleDists,
clustering_distance_cols = sampleDists,
annotation_col = annotation, annotation_colors = annoCol,
clustering_method = "ward.D2",
col = colors, show_colnames = F, show_rownames = F, fontsize = 16, annotation_names_col = FALSE)
dend <- hclust(sampleDists, method = 'ward.D2')
plot(dend, labels = FALSE)
annotation <- data.frame(subset(colData(vsd), select = c(Age)))
ordering <- data.frame('cluster' = c(1, 2, 3, 4, 5, 6),
'ordered_cluster' = c(4, 2, 3, 5, 6, 1)) #to resort the clusters according to the covered age range
clusters <- data.frame(cluster = cutree(dend, 6)) %>%
rownames_to_column() %>%
#left_join(data.frame(subset(annotation, select = Age)) %>% rownames_to_column()) %>%
left_join(annotation %>% rownames_to_column()) %>%
mutate(Age = as.numeric(Age)) %>%
left_join(ordering) %>%
mutate(ordered_cluster = factor(ordered_cluster, levels = seq(1, 6, 1)))
## Joining, by = "rowname"
## Joining, by = "cluster"
ggplot(clusters, aes(x = ordered_cluster, y = Age)) +
geom_boxplot(fill = c("#ffd166", "#ef476f", "#06d6a0", "#06d6a0", "#118ab2", "#073b4c")) +
theme_bw() +
geom_hline(yintercept = c(15.5, 36.5, 60.5, 85.5), color="red", linetype="dashed") +
theme(text = element_text(size = 18)) +
xlab("Hierarchical cluster")
ggplot(clusters, aes(x = cluster, y = Age, color = cluster)) +
geom_point() +
theme_bw() +
geom_hline(yintercept = c(15.5, 36.5, 60.5, 85.5))+
theme(text = element_text(size = 20))
# LASSO regressions with varying penalty parameter lamda
model1 <- glmnet(subset(df, select = -c(sample, Age)), df$Age, alpha = 1, lambda = 0.25)
lasso1 <- data.frame(ensembl_id = dimnames(coef(model1))[[1]], coefficient = matrix(coef(model1))) %>%
filter(coefficient != 0) %>%
filter(ensembl_id != "(Intercept)") %>%
mutate(lambda = 0.25)
model2 <- glmnet(subset(df, select = -c(sample, Age)), df$Age, alpha = 1, lambda = 0.5)
lasso2 <- data.frame(ensembl_id = dimnames(coef(model2))[[1]], coefficient = matrix(coef(model2))) %>%
filter(coefficient != 0) %>%
filter(ensembl_id != "(Intercept)") %>%
mutate(lambda = 0.5)
model3 <- glmnet(subset(df, select = -c(sample, Age)), df$Age, alpha = 1, lambda = 2)
lasso3 <- data.frame(ensembl_id = dimnames(coef(model3))[[1]], coefficient = matrix(coef(model3))) %>%
filter(coefficient != 0) %>%
filter(ensembl_id != "(Intercept)") %>%
mutate(lambda = 2)
model4 <- glmnet(subset(df, select = -c(sample, Age)), df$Age, alpha = 1, lambda = 4)
lasso4 <- data.frame(ensembl_id = dimnames(coef(model4))[[1]], coefficient = matrix(coef(model4))) %>%
filter(coefficient != 0) %>%
filter(ensembl_id != "(Intercept)") %>%
mutate(lambda = 4)
all_models <- rbind(lasso1, lasso2, mutate(lasso, lambda = 1), lasso3, lasso4) %>%
group_by(lambda) %>%
group_split()
names(all_models) <- c("Lambda = 0.25", "Lambda = 0.5", "Lambda = 1", "Lambda = 2", "Lambda = 4")
all_models <- lapply(all_models, function(list) list$ensembl_id)
upset(fromList(all_models), order.by = "freq",
mainbar.y.label = "Intersections", sets.x.label = "LASSO-selected genes",
text.scale = c(4, 4, 3.5, 3.5, 4, 4), nintersects = 20,
keep.order= TRUE, sets = c("Lambda = 4", "Lambda = 2",
"Lambda = 1", "Lambda = 0.5", "Lambda = 0.25"))
# variance-stabilizing transformation
vsd <- vst(dds)
# filter for LASSO genes
keep <- rownames(vsd) %in% lasso$ensembl_id
vsd <- vsd[keep,]
# PCA
pca <- plotPCA(vsd, intgroup = c("age_group"), returnData = TRUE)
age_groups <- data.frame("age_group" = c("1-15", "16-26", "27-60", "61-85", "86-96"),
"Groups" = c("Group 1", "Group 2", "Group 3",
"Group 4", "Group 5"))
pca %<>% left_join(age_groups)
## Joining, by = "age_group"
percentVar <- round(100 * attr(pca, "percentVar"))
ggplot(pca, aes(-PC1, PC2, color=Groups)) +
geom_point(size=2) +
xlab(paste0("- PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
#coord_fixed() +
theme_bw() +
theme(text = element_text(size = 18)) +
scale_colour_manual(values = c('#ffd166', '#ef476f', '#06d6a0', '#118ab2', '#073b4c')) +
labs(color='Age Groups')
# calculate foldchanges between youngest and oldest age group
res <- results(dds, contrast=c("age_group", '86-96', '1-15'))
res$gene <- rowData(dds)$symbol
res <- subset(res, select = c(gene, log2FoldChange, pvalue, padj)) %>%
as.data.frame() %>%
rownames_to_column(var = "ensembl_id") %>%
filter(gene != "") %>%
drop_na(padj) %>%
filter(padj < 0.05) %>%
mutate(transition = "fc_1-15_86-96")
# split DE genes for various comparisons into list to visualize the intersections
transitions <- c("fc_16-26_27-60", "fc_61-85_86-96", "fc_1-15_86-96")
top_list <- subset(p_0.05, select = c(gene, transition)) %>%
filter(transition %in% c("fc_16-26_27-60", "fc_61-85_86-96")) %>%
bind_rows(subset(res, select = c(gene, transition))) %>%
mutate(transition = factor(transition, levels = transitions)) %>%
group_by(transition) %>%
group_split()
names(top_list) <- c("group2_vs_group3", "group4_vs_group5", "group1_vs_group5")
top_list <- lapply(top_list, function(list) list$gene)
upset(fromList(top_list), order.by = "freq",
mainbar.y.label = "Intersections", sets.x.label = "DE Genes",
text.scale = c(2.5, 2.5, 2, 2, 2.5, 2.5), nintersects = 10,
keep.order= TRUE, sets = c("group1_vs_group5", "group4_vs_group5", "group2_vs_group3"))
groups <- data.frame("group" = c("Upregulated DE genes", "Downregulated DE genes", "Non-DE genes", "Non-DE genes"),
"significant" = c("DE genes", "DE genes", "non DE genes", "non DE genes"),
"direction" = c("upregulated", "downregulated", "upregulated", "downregulated"))
res <- results(dds, contrast=c("age_group", '86-96', '1-15'))
res$gene <- rowData(dds)$symbol
res <- subset(res, select = c(gene, log2FoldChange, pvalue, padj)) %>%
as.data.frame() %>%
rownames_to_column(var = "ensembl_gene_id") %>%
filter(gene != "") %>%
drop_na(padj) %>%
mutate(significant = ifelse(padj < 0.05, "DE genes", "non DE genes")) %>%
mutate(direction = ifelse(log2FoldChange > 0, "upregulated", "downregulated")) %>%
left_join(groups)
## Joining, by = c("significant", "direction")
# Query gene information from biomart
genes <- unique(res$ensembl_gene_id)
mart <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
gene_lengths <- getBM(filters="ensembl_gene_id",
attributes=c("ensembl_gene_id",
"chromosome_name", "start_position",
"end_position", "external_gene_name"),
values=c(genes), mart=mart) %>%
data.frame() %>%
group_by(ensembl_gene_id) %>%
summarize(chr = dplyr::first(chromosome_name), start = dplyr::first(start_position),
end = dplyr::first(end_position), gene_symbol = dplyr::first(external_gene_name)) %>%
mutate(gene_length = end - start) %>%
drop_na(gene_length) %>%
dplyr::filter(chr %in% as.character(seq(1,22,1))) %>%
subset(select = c(ensembl_gene_id, gene_length)) %>%
right_join(res)
## Joining, by = "ensembl_gene_id"
ggplot(gene_lengths, aes(x = group, y = log(gene_length), fill = group)) +
geom_boxplot() +
theme_bw() +
xlab("") + ylab("Logarithmic gene length (bp)") +
theme(text = element_text(size = 22)) +
theme(axis.text.x = element_blank(), axis.ticks = element_blank()) +
scale_fill_manual(values = c('#1565c0', '#999999', '#c62828'))
## Warning: Removed 1533 rows containing non-finite values (`stat_boxplot()`).
# T-test for downregulated genes
t.test(dplyr::filter(gene_lengths, group == "Downregulated DE genes")$gene_length,
dplyr::filter(gene_lengths, group == "Non-DE genes")$gene_length,
mu = 0, alternative = "less")
##
## Welch Two Sample t-test
##
## data: dplyr::filter(gene_lengths, group == "Downregulated DE genes")$gene_length and dplyr::filter(gene_lengths, group == "Non-DE genes")$gene_length
## t = -11.247, df = 9335.9, p-value < 2.2e-16
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf -23191.84
## sample estimates:
## mean of x mean of y
## 52805.58 79970.85
# T-test for upregulated genes
t.test(dplyr::filter(gene_lengths, group == "Upregulated DE genes")$gene_length,
dplyr::filter(gene_lengths, group == "Non-DE genes")$gene_length,
mu = 0, alternative = "greater")
##
## Welch Two Sample t-test
##
## data: dplyr::filter(gene_lengths, group == "Upregulated DE genes")$gene_length and dplyr::filter(gene_lengths, group == "Non-DE genes")$gene_length
## t = 4.7429, df = 8070.7, p-value = 1.071e-06
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 8997.297 Inf
## sample estimates:
## mean of x mean of y
## 93745.94 79970.85
fc_length <- foldchanges %>%
left_join(mutate(subset(foldchanges_filtered, select = c(ensembl_id, transition, log2FoldChange)),
DE_group = ifelse(log2FoldChange > 0, "DE genes upregulated", "DE genes downregulated"))) %>%
mutate(group = replace_na(DE_group, "non DE genes")) %>%
#mutate(significant = ifelse(padj < 0.05, "DE genes", "non DE genes")) %>%
dplyr::rename(ensembl_gene_id = ensembl_id)
## Joining, by = c("ensembl_id", "transition", "log2FoldChange")
age_groups = data.frame(transition = c("fc_1-15_16-26", "fc_16-26_27-60", "fc_27-60_61-85",
"fc_61-85_86-96"),
age_transition = c("Group 1 vs. Group 2", "Group 2 vs. Group 3", "Group 3 vs. Group 4", "Group 4 vs. Group 5"))
# Query gene information from biomart
genes <- unique(fc_length$ensembl_gene_id)
mart <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
gene_lengths <- getBM(filters="ensembl_gene_id",
attributes=c("ensembl_gene_id",
"chromosome_name", "start_position",
"end_position", "external_gene_name"),
values=c(genes), mart=mart) %>%
data.frame() %>%
group_by(ensembl_gene_id) %>%
summarize(chr = dplyr::first(chromosome_name), start = dplyr::first(start_position),
end = dplyr::first(end_position), gene_symbol = dplyr::first(external_gene_name)) %>%
mutate(gene_length = end - start) %>%
dplyr::filter(chr %in% as.character(seq(1,22,1))) %>%
subset(select = c(ensembl_gene_id, gene_length)) %>%
right_join(fc_length) %>%
left_join(age_groups)
## Joining, by = "ensembl_gene_id"
## Joining, by = "transition"
ggplot(gene_lengths, aes(x = DE_group, y = log(gene_length), fill = DE_group)) +
geom_boxplot() +
facet_wrap(~age_transition) +
theme_bw() +
xlab("") + ylab("log gene length") +
theme(axis.text.x = element_blank(), axis.ticks = element_blank())
## Warning: Removed 6132 rows containing non-finite values (`stat_boxplot()`).
# T-test for downregulated genes
#t.test(dplyr::filter(gene_lengths, transition == "Group 4 vs. Group 5", DE_group == "DE genes downregulated")$gene_length,
# dplyr::filter(gene_lengths, transition == "Group 4 vs. Group 5", DE_group == "non DE genes")$gene_length,
# mu = 0, alternative = "less")
# T-test for upregulated genes
#t.test(dplyr::filter(gene_lengths, transition == "Group 4 vs. Group 5", DE_group == "DE genes upregulated")$gene_length,
# dplyr::filter(gene_lengths, transition == "Group 4 vs. Group 5", DE_group == "non DE genes")$gene_length,
# mu = 0, alternative = "greater")